Load packages

library(readr)
library(tidyverse)
library(ggplot2)
library(ggthemes)
library(grid)
library(gridExtra)
library(DT)

Prepare data

# load data
# user_id values in this data set (e.g. 1615794190662701056) are larger than
# 2^53, the largest integer a double can store exactly. By default read_csv()
# guesses numeric-looking columns as double, so such IDs get rounded and no
# longer match their original digit strings. Read every ID column as
# character instead.
# NOTE(review): tweet_id and account_id are presumed to be IDs of the same
# kind -- confirm against df.csv.
df <- read_csv(
  "df.csv",
  col_types = cols(
    user_id = col_character(),
    tweet_id = col_character(),
    account_id = col_character()
  )
)

# Turn a vector of digit-string IDs into a factor whose levels are in numeric
# order. For non-negative integers without leading zeros, "shorter strings
# first, then alphabetical" is the same order as sorting the numbers, so the
# level order matches what a numeric ID column would have produced.
id_to_factor <- function(id) {
  lv <- unique(id[!is.na(id)])
  factor(id, levels = lv[order(nchar(lv), lv)])
}

# define data type: IDs become factors; account_id is renamed to friend_id
df <- df %>%
  mutate(
    user_id = id_to_factor(user_id),
    tweet_id = id_to_factor(tweet_id),
    friend_id = id_to_factor(account_id)
  ) %>%
  dplyr::select(-account_id)

# Largest user_friends_count recorded for each user.
# max_data: one row per user_id, holding max_friends_count.
max_data <- df %>%
  distinct(user_id, user_friends_count) %>%
  group_by(user_id) %>%
  mutate(max_friends_count = max(user_friends_count)) %>%
  dplyr::select(-user_friends_count) %>%
  distinct()

# Attach max_friends_count to every row of df, matching on user_id.
df <- merge(df, max_data, by = "user_id")
 
# define x-axis: number of tweets collected during one week
# n is the number of rows for each (user_id, tweet_id) pair; x is the running
# total of n within each user.
df_for_x <- df %>%
  group_by(user_id) %>%
  count(tweet_id) %>%
  mutate(x = cumsum(n)) %>%
  dplyr::select(user_id, tweet_id, x)

# Bring x back onto the row-level data.
df <- inner_join(df, df_for_x, by = c("user_id", "tweet_id"))

# define y-axis:
#   numerator = running count of distinct friend_id values seen so far for
#               the user, walking the rows in ascending x
#   y         = numerator / max_friends_count
df2 <- df %>%
  arrange(user_id, x) %>%
  group_by(user_id) %>%
  mutate(numerator = cumsum(!duplicated(friend_id))) %>%
  mutate(y = numerator / max_friends_count)

# df2 is the final data for drawing plots 

Plot 1.

# plot 1.
# All users on one panel, one colour per user_id (legend hidden).
# y is stored as a fraction (numerator / max_friends_count), so the axis is
# formatted with label_percent() to agree with the "(%)" in the axis title.
ggplot(df2, aes(x = x, y = y, col = user_id)) +
  geom_point(alpha = 0.5) +
  theme_few() +
  theme(legend.position = "none") +
  xlab("Number of Tweets Collected") +
  ylab("Fraction of Distinct Accounts Appearing in Tweets (%)") +
  scale_x_continuous(n.breaks = 10) +
  scale_y_continuous(labels = scales::label_percent())

Plot 2. Grid by User

In the data frame (df2), there are 60 unique users.

# Same scatter as plot 1, but one panel per user in a 6-column grid with
# shared axes.
# y is stored as a fraction, so the axis is formatted with label_percent() to
# agree with the "(%)" in the axis title.
ggplot(df2, aes(x = x, y = y, col = user_id)) +
  geom_point(alpha = 0.5) +
  theme_few() +
  theme(legend.position = "none") +
  xlab("Number of Tweets Collected") +
  ylab("Fraction of Distinct Accounts Appearing in Tweets (%)") +
  scale_x_continuous(n.breaks = 10) +
  scale_y_continuous(labels = scales::label_percent()) +
  facet_wrap(~user_id, ncol = 6, scales = "fixed")

Plot 3 & Plot 4

Let’s redraw the plot, this time splitting the users into two smaller groups. I also allow the scale of the x-axis to vary for each user.

# Users whose factor level code is 1-30, one panel each; "free_x" gives every
# panel its own x-axis range.
# - y is stored as a fraction, so the axis is formatted with label_percent()
#   to agree with the "(%)" in the axis title.
# - Only ncol is given to facet_wrap(): 30 panels in 6 columns need 5 rows,
#   so the row count is left for facet_wrap() to derive.
df2 %>%
  filter(as.integer(user_id) < 31) %>% # from 1~30
  ggplot(aes(x = x, y = y, col = user_id)) +
  geom_point(alpha = 0.5) +
  theme_few() +
  theme(legend.position = "none") +
  xlab("Number of Tweets Collected") +
  ylab("Fraction of Distinct Accounts Appearing in Tweets (%)") +
  scale_x_continuous(n.breaks = 5) +
  scale_y_continuous(labels = scales::label_percent()) +
  facet_wrap(~user_id, ncol = 6, scales = "free_x") +
  ggtitle("First 30 users")

# Users whose factor level code is above 30, one panel each; "free_x" gives
# every panel its own x-axis range.
# - y is stored as a fraction, so the axis is formatted with label_percent()
#   to agree with the "(%)" in the axis title.
# - Only ncol is given to facet_wrap(): the row count is derived from the
#   number of panels instead of being hard-coded.
df2 %>%
  filter(as.integer(user_id) > 30) %>% # from 31~60
  ggplot(aes(x = x, y = y, col = user_id)) +
  geom_point(alpha = 0.5) +
  theme_few() +
  theme(legend.position = "none") +
  xlab("Number of Tweets Collected") +
  ylab("Fraction of Distinct Accounts Appearing in Tweets (%)") +
  scale_x_continuous(n.breaks = 5) +
  scale_y_continuous(labels = scales::label_percent()) +
  facet_wrap(~user_id, ncol = 6, scales = "free_x") +
  ggtitle("Second 30 users")

Plot 5. Distribution of Friends Count

It seems some people follow very few accounts. Let’s check the distribution of the friends count.

# d1: histogram of max_friends_count, one observation per user.
d1 <- df2 %>%
  distinct(user_id, max_friends_count) %>%
  ggplot(aes(max_friends_count)) +
  geom_histogram(bins = 80) +
  theme_few() +
  xlab("Friends Count (# of friends)") +
  ylab("Frequency (# of users)") +
  ggtitle("Histogram of Friends Count")

# d2: the same histogram on a log10 x-axis; tick labels stay in the original
# units, rounded to whole numbers.
# The argument is spelled `labels` in full rather than relying on partial
# matching of `label`.
d2 <- df2 %>%
  distinct(user_id, max_friends_count) %>%
  ggplot(aes(max_friends_count)) +
  geom_histogram(bins = 80) +
  theme_few() +
  xlab("(Log) Friends Count (# of friends)") +
  ylab("Frequency (# of users)") +
  ggtitle("Histogram of (Log) Friends Count") +
  scale_x_log10(
    n.breaks = 10,
    labels = scales::label_number(accuracy = 1)
  )


# d3: horizontal boxplot of max_friends_count (one observation per user),
# outliers drawn as red stars.
# NOTE(review): xlim() sets scale limits, so users outside 0-5000 are dropped
# before the boxplot statistics are computed -- confirm this is intended
# rather than a pure zoom.
d3 <- df2 %>%
  distinct(user_id, max_friends_count) %>%
  ggplot(aes(x = max_friends_count)) +
  geom_boxplot(outlier.color = 'red', outlier.shape = 8) +
  scale_y_discrete() +
  xlim(0, 5000) +
  theme_few() +
  ggtitle("Boxplot of Friends Count") +
  xlab("Friends Count") +
  ylab("")

# d4: the same boxplot on the natural log of max_friends_count.
d4 <- df2 %>%
  distinct(user_id, max_friends_count) %>%
  mutate(log_friends_count = log(max_friends_count)) %>%
  ggplot(aes(x = log_friends_count)) +
  geom_boxplot(outlier.color = 'red', outlier.shape = 8) +
  scale_y_discrete() +
  xlim(0, 10) +
  theme_few() +
  ggtitle("Boxplot of (Log) Friends Count") +
  xlab("(Log) Friends Count") +
  ylab(" ")

# Lay the two histograms and two boxplots out in a 2-column grid.
grid.arrange(grobs = list(d1, d2, d3, d4), ncol = 2)

# One row per user with its max_friends_count, smallest counts first.
table_dta <- df2 %>%
  distinct(user_id, max_friends_count) %>%
  arrange(max_friends_count)

# Interactive table with per-column filter boxes at the top.
datatable(
  table_dta,
  caption = "Print User-Friends Count by Ascending Order",
  filter = "top"
)

With these users with very few friends in mind, let’s draw aggregated plots.

Plot 6. Aggregate plot

# aggregate plot: mean of y-axis by each point of x
# The mean is taken over all rows that share the same x, pooling users.
# y is a fraction, so the axis is formatted with label_percent() to agree
# with the "(%)" in the axis title.
ag1 <- df2 %>%
  group_by(x) %>%
  summarize(y = mean(y)) %>%
  ungroup() %>%
  ggplot(aes(x = x, y = y)) +
  geom_point(alpha = 0.5) +
  geom_smooth(color = 'darkcyan', linewidth = 0.5) +
  theme_few() +
  theme(legend.position = "none") +
  xlab("Number of Tweets Collected") +
  ylab("Mean of distinct accounts / # of friends (%)") +
  scale_x_continuous(n.breaks = 10) +
  scale_y_continuous(labels = scales::label_percent())

ag1
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Plot 7. Aggregate plot (log-scaled x-axis)

# aggregate plot: mean of y-axis by each point of x, drawn on a log10 x-axis
# - y is a fraction, so the axis is formatted with label_percent() to agree
#   with the "(%)" in the axis title.
# - `labels` is spelled out in full rather than relying on partial matching
#   of `label`.
ag2 <- df2 %>%
  group_by(x) %>%
  summarize(y = mean(y)) %>%
  ungroup() %>%
  ggplot(aes(x = x, y = y)) +
  geom_point(alpha = 0.5) +
  geom_smooth(color = 'darkcyan', linewidth = 0.5) +
  theme_few() +
  theme(legend.position = "none") +
  xlab("(Log) Number of Tweets Collected") +
  ylab("Mean of distinct accounts / # of friends (%)") +
  scale_x_log10(
    n.breaks = 10,
    labels = scales::label_number(accuracy = 1)
  ) +
  scale_y_continuous(labels = scales::label_percent())

ag2
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Plot 8 & Plot 9

What happens to Plot 6 and Plot 7 if I remove the ones with very few friends (as seen in Plot 5)?

# Plot 6 redrawn without the three users that have 5 friends or fewer.
# - The excluded IDs are matched with %in% against a single vector instead of
#   a chain of != comparisons.
# - y is a fraction, so the axis is formatted with label_percent() to agree
#   with the "(%)" in the axis title.
# NOTE(review): filter(max_friends_count > 5) would state the intent directly
# and would not depend on hard-coded IDs -- confirm it selects the same users.
ag3 <- df2 %>%
  # remove users with 5 friends or fewer
  filter(
    !user_id %in% c(
      "1615794190662701056",
      "1429638571481309184",
      "777808023938928640"
    )
  ) %>%
  group_by(x) %>%
  summarize(y = mean(y)) %>%
  ungroup() %>%
  ggplot(aes(x = x, y = y)) +
  geom_point(alpha = 0.5) +
  geom_smooth(color = 'darkcyan', linewidth = 0.5) +
  theme_few() +
  theme(legend.position = "none") +
  xlab("Number of Tweets Collected") +
  ylab("Mean of distinct accounts / # of friends (%)") +
  scale_x_continuous(n.breaks = 10) +
  scale_y_continuous(labels = scales::label_percent()) +
  # vertical reference lines at x = 5000 (red) and x = 15000 (blue)
  geom_vline(xintercept = 5000, linetype = 2, color = "red", alpha = 0.5) +
  geom_vline(xintercept = 15000, linetype = 3, color = "blue", alpha = 0.5)

ag3
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Plot 7 (log10 x-axis) redrawn without the three users that have 5 friends
# or fewer.
# - The excluded IDs are matched with %in% against a single vector instead of
#   a chain of != comparisons.
# - y is a fraction, so the axis is formatted with label_percent() to agree
#   with the "(%)" in the axis title.
# - `labels` is spelled out in full rather than relying on partial matching
#   of `label`.
ag4 <- df2 %>%
  # remove users with 5 friends or fewer
  filter(
    !user_id %in% c(
      "1615794190662701056",
      "1429638571481309184",
      "777808023938928640"
    )
  ) %>%
  group_by(x) %>%
  summarize(y = mean(y)) %>%
  ungroup() %>%
  ggplot(aes(x = x, y = y)) +
  geom_point(alpha = 0.5) +
  geom_smooth(color = 'darkcyan', linewidth = 0.5) +
  theme_few() +
  theme(legend.position = "none") +
  xlab("(Log) Number of Tweets Collected") +
  ylab("Mean of distinct accounts / # of friends (%)") +
  scale_x_log10(
    n.breaks = 10,
    labels = scales::label_number(accuracy = 1)
  ) +
  scale_y_continuous(labels = scales::label_percent())

ag4
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Let’s bind plots 6~9 together to facilitate comparison:

# Show the four aggregate plots (ag1-ag4) side by side in a 2-column grid.
grid.arrange(grobs = list(ag1, ag2, ag3, ag4), ncol = 2)